The data set was import from: NIH - National Cancer Institute (SEER)
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from sklearn.tree import export_text
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, accuracy_score
from IPython.display import display
data = pd.read_csv('data.csv')
print(data.head())
diagnosis radius_mean texture_mean perimeter_mean area_mean \ 0 M 17.99 10.38 122.80 1001.0 1 M 20.57 17.77 132.90 1326.0 2 M 19.69 21.25 130.00 1203.0 3 M 11.42 20.38 77.58 386.1 4 M 20.29 14.34 135.10 1297.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 0 0.11840 0.27760 0.3001 0.14710 1 0.08474 0.07864 0.0869 0.07017 2 0.10960 0.15990 0.1974 0.12790 3 0.14250 0.28390 0.2414 0.10520 4 0.10030 0.13280 0.1980 0.10430 symmetry_mean ... concavity_worst concave points_worst symmetry_worst \ 0 0.2419 ... 0.7119 0.2654 0.4601 1 0.1812 ... 0.2416 0.1860 0.2750 2 0.2069 ... 0.4504 0.2430 0.3613 3 0.2597 ... 0.6869 0.2575 0.6638 4 0.1809 ... 0.4000 0.1625 0.2364 fractal_dimension_worst age stage_of_cancer treatment_administered \ 0 0.11890 74 Stage 1 Radiation 1 0.08902 77 Stage 1 Surgery 2 0.08758 83 Stage 4 Hormonal 3 0.17300 30 Stage 3 Radiation 4 0.07678 33 Stage 2 Hormonal duration censor survival_probability 0 20 1 0.829205 1 67 0 0.780743 2 93 0 0.507499 3 38 1 0.480011 4 62 1 0.485004 [5 rows x 37 columns]
data.replace('?', pd.NA, inplace=True)
data.dropna(inplace=True)
data.shape
data.describe()
(569, 37)
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | age | duration | censor | survival_probability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
| mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 | 59.641476 | 63.465729 | 0.794376 | 0.573025 |
| std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 | 17.749472 | 32.925748 | 0.404512 | 0.143626 |
| min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 | 30.000000 | 6.000000 | 0.000000 | 0.204240 |
| 25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 | 45.000000 | 34.000000 | 1.000000 | 0.467546 |
| 50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 | 60.000000 | 63.000000 | 1.000000 | 0.576885 |
| 75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 | 75.000000 | 92.000000 | 1.000000 | 0.689974 |
| max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 | 90.000000 | 120.000000 | 1.000000 | 0.866458 |
8 rows × 34 columns
As we can see there are no missing values, "SEER" datasets shoul'd be clean and without missing values.
Acording to the article, the goal is to predict whether a patient survived or not.
"1" = Survived , "0" = Didn't Survived
data['target'] = (data['survival_probability'] > 0.5).astype(int)
Binary_data = data.copy()
data = data.drop('survival_probability', axis=1)
data = data.drop('censor', axis=1)
data.to_csv('data_01.csv', index=False)
data.head()
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | age | stage_of_cancer | treatment_administered | duration | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | ... | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 74 | Stage 1 | Radiation | 20 | 1 |
| 1 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | ... | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 77 | Stage 1 | Surgery | 67 | 1 |
| 2 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | ... | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 83 | Stage 4 | Hormonal | 93 | 1 |
| 3 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | ... | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 30 | Stage 3 | Radiation | 38 | 0 |
| 4 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | ... | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 33 | Stage 2 | Hormonal | 62 | 0 |
5 rows × 36 columns
We made sure to preserve target variable distribution using Stratification Parameter.
X = data.drop(columns=['target'])
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
print(f"Stratification of train set: {y_train.mean()}")
print(f"Stratification of test set: {y_test.mean()}")
Stratification of train set: 0.6659340659340659 Stratification of test set: 0.6666666666666666
Exports the sets
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')
objectAttributesKey = ["diagnosis", "stage_of_cancer", "treatment_administered"]
integerAttributesKey = ["age"]
data[objectAttributesKey] = data[objectAttributesKey].astype('category')
data[integerAttributesKey] = data[integerAttributesKey].astype(int)
data = pd.get_dummies(data, columns=objectAttributesKey, drop_first=True)
data.to_csv('train_data_01.csv', index=False)
data.head()
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | duration | target | diagnosis_M | stage_of_cancer_Stage 2 | stage_of_cancer_Stage 3 | stage_of_cancer_Stage 4 | treatment_administered_Combination | treatment_administered_Hormonal | treatment_administered_Radiation | treatment_administered_Surgery | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 20 | 1 | True | False | False | False | False | False | True | False |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 67 | 1 | True | False | False | False | False | False | False | True |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 93 | 1 | True | False | False | True | False | True | False | False |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 38 | 0 | True | False | True | False | False | False | True | False |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 62 | 0 | True | True | False | False | False | True | False | False |
5 rows × 41 columns
We will check for the distribution of 'target'
train_data['target'].value_counts()
| count | |
|---|---|
| target | |
| 1 | 303 |
| 0 | 152 |
we'll calculate the 1.5 times the Interquartile Range (IQR) and visualize them using boxplots.
train_data['target'] = train_data['target'].map({1: 'Survived', 0: "Didn't Survived"})
numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = [col for col in numerical_columns if col != 'target']
sns.set(rc={'figure.figsize': (6, 3)})
sns.set_context("talk")
for column in numerical_columns:
plt.figure(figsize=(6, 3))
sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
plt.title(f'Boxplot of {column} vs Target', fontsize=16)
plt.xlabel('target', fontsize=14)
plt.ylabel(column, fontsize=14)
plt.show()
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='radius_mean'>
Text(0.5, 1.0, 'Boxplot of radius_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'radius_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='texture_mean'>
Text(0.5, 1.0, 'Boxplot of texture_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'texture_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='perimeter_mean'>
Text(0.5, 1.0, 'Boxplot of perimeter_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'perimeter_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='area_mean'>
Text(0.5, 1.0, 'Boxplot of area_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'area_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='smoothness_mean'>
Text(0.5, 1.0, 'Boxplot of smoothness_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'smoothness_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='compactness_mean'>
Text(0.5, 1.0, 'Boxplot of compactness_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'compactness_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concavity_mean'>
Text(0.5, 1.0, 'Boxplot of concavity_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concavity_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concave points_mean'>
Text(0.5, 1.0, 'Boxplot of concave points_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concave points_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='symmetry_mean'>
Text(0.5, 1.0, 'Boxplot of symmetry_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'symmetry_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='fractal_dimension_mean'>
Text(0.5, 1.0, 'Boxplot of fractal_dimension_mean vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'fractal_dimension_mean')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='radius_se'>
Text(0.5, 1.0, 'Boxplot of radius_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'radius_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='texture_se'>
Text(0.5, 1.0, 'Boxplot of texture_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'texture_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='perimeter_se'>
Text(0.5, 1.0, 'Boxplot of perimeter_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'perimeter_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='area_se'>
Text(0.5, 1.0, 'Boxplot of area_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'area_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='smoothness_se'>
Text(0.5, 1.0, 'Boxplot of smoothness_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'smoothness_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='compactness_se'>
Text(0.5, 1.0, 'Boxplot of compactness_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'compactness_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concavity_se'>
Text(0.5, 1.0, 'Boxplot of concavity_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concavity_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concave points_se'>
Text(0.5, 1.0, 'Boxplot of concave points_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concave points_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='symmetry_se'>
Text(0.5, 1.0, 'Boxplot of symmetry_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'symmetry_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='fractal_dimension_se'>
Text(0.5, 1.0, 'Boxplot of fractal_dimension_se vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'fractal_dimension_se')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='radius_worst'>
Text(0.5, 1.0, 'Boxplot of radius_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'radius_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='texture_worst'>
Text(0.5, 1.0, 'Boxplot of texture_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'texture_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='perimeter_worst'>
Text(0.5, 1.0, 'Boxplot of perimeter_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'perimeter_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='area_worst'>
Text(0.5, 1.0, 'Boxplot of area_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'area_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='smoothness_worst'>
Text(0.5, 1.0, 'Boxplot of smoothness_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'smoothness_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='compactness_worst'>
Text(0.5, 1.0, 'Boxplot of compactness_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'compactness_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concavity_worst'>
Text(0.5, 1.0, 'Boxplot of concavity_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concavity_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='concave points_worst'>
Text(0.5, 1.0, 'Boxplot of concave points_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'concave points_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='symmetry_worst'>
Text(0.5, 1.0, 'Boxplot of symmetry_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'symmetry_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='fractal_dimension_worst'>
Text(0.5, 1.0, 'Boxplot of fractal_dimension_worst vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'fractal_dimension_worst')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='age'>
Text(0.5, 1.0, 'Boxplot of age vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'age')
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
<Axes: xlabel='target', ylabel='duration'>
Text(0.5, 1.0, 'Boxplot of duration vs Target')
Text(0.5, 0, 'target')
Text(0, 0.5, 'duration')
It appears that there are some patients with more then one outlier, we'll remove them from the data set.
train_data = pd.read_csv('train_data.csv')
numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = [col for col in numerical_columns if col != 'target']
outliers_mask = pd.Series([False] * len(train_data))
for column in numerical_columns:
Q1 = train_data[column].quantile(0.25)
Q3 = train_data[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_mask |= (train_data[column] < lower_bound) | (train_data[column] > upper_bound)
train_data = train_data[~outliers_mask]
rows_removed = outliers_mask.sum()
print(f"Number of patients removed: {rows_removed}")
train_data.to_csv('train_data_02.csv')
Number of patients removed: 138
fig, axes = plt.subplots(10, 4, figsize=(40, 40))
axes = axes.flatten()
for index, columnName in enumerate(train_data.columns[1:]):
ax = axes[index]
if train_data[columnName].dtype == 'object':
sns.countplot(x=columnName, data=train_data, ax=ax)
else:
sns.histplot(x=columnName, data=train_data, ax=ax)
ax.set_title(columnName)
plt.tight_layout()
plt.show()
numerical_features = train_data.select_dtypes(include=['int64', 'float64']).columns[1:]
skewed_features = train_data[numerical_features].skew().sort_values(ascending=False)
skewness_df = pd.DataFrame({'Skew': skewed_features})
print(skewness_df)
<Axes: xlabel='diagnosis', ylabel='count'>
Text(0.5, 1.0, 'diagnosis')
<Axes: xlabel='radius_mean', ylabel='Count'>
Text(0.5, 1.0, 'radius_mean')
<Axes: xlabel='texture_mean', ylabel='Count'>
Text(0.5, 1.0, 'texture_mean')
<Axes: xlabel='perimeter_mean', ylabel='Count'>
Text(0.5, 1.0, 'perimeter_mean')
<Axes: xlabel='area_mean', ylabel='Count'>
Text(0.5, 1.0, 'area_mean')
<Axes: xlabel='smoothness_mean', ylabel='Count'>
Text(0.5, 1.0, 'smoothness_mean')
<Axes: xlabel='compactness_mean', ylabel='Count'>
Text(0.5, 1.0, 'compactness_mean')
<Axes: xlabel='concavity_mean', ylabel='Count'>
Text(0.5, 1.0, 'concavity_mean')
<Axes: xlabel='concave points_mean', ylabel='Count'>
Text(0.5, 1.0, 'concave points_mean')
<Axes: xlabel='symmetry_mean', ylabel='Count'>
Text(0.5, 1.0, 'symmetry_mean')
<Axes: xlabel='fractal_dimension_mean', ylabel='Count'>
Text(0.5, 1.0, 'fractal_dimension_mean')
<Axes: xlabel='radius_se', ylabel='Count'>
Text(0.5, 1.0, 'radius_se')
<Axes: xlabel='texture_se', ylabel='Count'>
Text(0.5, 1.0, 'texture_se')
<Axes: xlabel='perimeter_se', ylabel='Count'>
Text(0.5, 1.0, 'perimeter_se')
<Axes: xlabel='area_se', ylabel='Count'>
Text(0.5, 1.0, 'area_se')
<Axes: xlabel='smoothness_se', ylabel='Count'>
Text(0.5, 1.0, 'smoothness_se')
<Axes: xlabel='compactness_se', ylabel='Count'>
Text(0.5, 1.0, 'compactness_se')
<Axes: xlabel='concavity_se', ylabel='Count'>
Text(0.5, 1.0, 'concavity_se')
<Axes: xlabel='concave points_se', ylabel='Count'>
Text(0.5, 1.0, 'concave points_se')
<Axes: xlabel='symmetry_se', ylabel='Count'>
Text(0.5, 1.0, 'symmetry_se')
<Axes: xlabel='fractal_dimension_se', ylabel='Count'>
Text(0.5, 1.0, 'fractal_dimension_se')
<Axes: xlabel='radius_worst', ylabel='Count'>
Text(0.5, 1.0, 'radius_worst')
<Axes: xlabel='texture_worst', ylabel='Count'>
Text(0.5, 1.0, 'texture_worst')
<Axes: xlabel='perimeter_worst', ylabel='Count'>
Text(0.5, 1.0, 'perimeter_worst')
<Axes: xlabel='area_worst', ylabel='Count'>
Text(0.5, 1.0, 'area_worst')
<Axes: xlabel='smoothness_worst', ylabel='Count'>
Text(0.5, 1.0, 'smoothness_worst')
<Axes: xlabel='compactness_worst', ylabel='Count'>
Text(0.5, 1.0, 'compactness_worst')
<Axes: xlabel='concavity_worst', ylabel='Count'>
Text(0.5, 1.0, 'concavity_worst')
<Axes: xlabel='concave points_worst', ylabel='Count'>
Text(0.5, 1.0, 'concave points_worst')
<Axes: xlabel='symmetry_worst', ylabel='Count'>
Text(0.5, 1.0, 'symmetry_worst')
<Axes: xlabel='fractal_dimension_worst', ylabel='Count'>
Text(0.5, 1.0, 'fractal_dimension_worst')
<Axes: xlabel='age', ylabel='Count'>
Text(0.5, 1.0, 'age')
<Axes: xlabel='stage_of_cancer', ylabel='count'>
Text(0.5, 1.0, 'stage_of_cancer')
<Axes: xlabel='treatment_administered', ylabel='count'>
Text(0.5, 1.0, 'treatment_administered')
<Axes: xlabel='duration', ylabel='Count'>
Text(0.5, 1.0, 'duration')
<Axes: xlabel='target', ylabel='Count'>
Text(0.5, 1.0, 'target')
Now we'll perform Log Transformation for the features that has right skewness and Square Root Transformation to the features that has left skewness.
# Threshold for skewness
threshold = 0.75
def transform_features(train_data, skewed_features, threshold=0.75):
changed_features = {}
for feature in skewed_features.index:
skewness = skewed_features[feature]
original_values = train_data[feature].copy()
if skewness > threshold:
if (train_data[feature] > 0).all():
train_data[feature] = np.log1p(train_data[feature])
else:
print(f"Skipping {feature} due to negative or zero values in the data.")
elif skewness < -threshold:
train_data[feature] = np.sqrt(train_data[feature].abs())
if not train_data[feature].equals(original_values):
changed_features[feature] = {
'original_skewness': skewness,
'new_skewness': train_data[feature].skew()
}
return train_data, changed_features
train_data_transformed, changed_features = transform_features(train_data.copy(), skewed_features, threshold)
if changed_features:
print("\nFeatures with Skewness Changes:")
for feature, skewness_info in changed_features.items():
print(f"Feature: {feature}")
print(f" Original Skewness: {skewness_info['original_skewness']}")
print(f" New Skewness: {skewness_info['new_skewness']}\n")
else:
print("No features had significant changes in skewness.")
if changed_features:
fig, axes = plt.subplots(3, 4, figsize=(40, 40))
axes = axes.flatten() if len(changed_features) > 1 else [axes]
for ax, (feature, _) in zip(axes, changed_features.items()):
sns.histplot(train_data_transformed[feature], ax=ax)
ax.set_title(f"{feature} (Transformed)")
plt.tight_layout()
plt.show()
Skew area_se 1.500621 concavity_mean 1.341517 area_worst 1.259710 fractal_dimension_se 1.152364 area_mean 1.139781 concave points_mean 1.109217 perimeter_se 1.056725 radius_se 1.045883 concavity_se 0.982360 compactness_se 0.938201 compactness_worst 0.927227 concavity_worst 0.905529 symmetry_se 0.823085 radius_worst 0.803780 perimeter_worst 0.779180 compactness_mean 0.727498 fractal_dimension_worst 0.726821 smoothness_se 0.694778 texture_se 0.665351 perimeter_mean 0.634102 radius_mean 0.609254 fractal_dimension_mean 0.545257 concave points_worst 0.535519 symmetry_worst 0.455974 texture_mean 0.377752 texture_worst 0.264952 smoothness_worst 0.241018 concave points_se 0.228033 smoothness_mean 0.184778 symmetry_mean 0.141812 age 0.088668 duration -0.069092 target -0.631235 Skipping concavity_mean due to negative or zero values in the data. Skipping concave points_mean due to negative or zero values in the data. Skipping concavity_se due to negative or zero values in the data. Skipping concavity_worst due to negative or zero values in the data. Features with Skewness Changes: Feature: area_se Original Skewness: 1.500620894091251 New Skewness: 0.37384820311316624 Feature: area_worst Original Skewness: 1.2597098098356303 New Skewness: 0.20756307817678538 Feature: fractal_dimension_se Original Skewness: 1.1523641192918657 New Skewness: 1.14801861775621 Feature: area_mean Original Skewness: 1.139780952552101 New Skewness: -0.010332358714100688 Feature: perimeter_se Original Skewness: 1.056724920701799 New Skewness: 0.43460137178308555 Feature: radius_se Original Skewness: 1.045883338017319 New Skewness: 0.8134414067927772 Feature: compactness_se Original Skewness: 0.938200774575379 New Skewness: 0.9140320773551073 Feature: compactness_worst Original Skewness: 0.9272273016618426 New Skewness: 0.7074258537878015 Feature: symmetry_se Original Skewness: 0.8230851527881472 New Skewness: 0.8096950632228345 Feature: radius_worst Original Skewness: 0.8037804580212827 New Skewness: 0.2781036839218181 Feature: perimeter_worst Original Skewness: 0.779180092711607 New Skewness: 0.22718500225809612
<Axes: xlabel='area_se', ylabel='Count'>
Text(0.5, 1.0, 'area_se (Transformed)')
<Axes: xlabel='area_worst', ylabel='Count'>
Text(0.5, 1.0, 'area_worst (Transformed)')
<Axes: xlabel='fractal_dimension_se', ylabel='Count'>
Text(0.5, 1.0, 'fractal_dimension_se (Transformed)')
<Axes: xlabel='area_mean', ylabel='Count'>
Text(0.5, 1.0, 'area_mean (Transformed)')
<Axes: xlabel='perimeter_se', ylabel='Count'>
Text(0.5, 1.0, 'perimeter_se (Transformed)')
<Axes: xlabel='radius_se', ylabel='Count'>
Text(0.5, 1.0, 'radius_se (Transformed)')
<Axes: xlabel='compactness_se', ylabel='Count'>
Text(0.5, 1.0, 'compactness_se (Transformed)')
<Axes: xlabel='compactness_worst', ylabel='Count'>
Text(0.5, 1.0, 'compactness_worst (Transformed)')
<Axes: xlabel='symmetry_se', ylabel='Count'>
Text(0.5, 1.0, 'symmetry_se (Transformed)')
<Axes: xlabel='radius_worst', ylabel='Count'>
Text(0.5, 1.0, 'radius_worst (Transformed)')
<Axes: xlabel='perimeter_worst', ylabel='Count'>
Text(0.5, 1.0, 'perimeter_worst (Transformed)')
After the transformations, we'll remove the features that still has high skewness
final_skewness = train_data_transformed[numerical_features].skew().sort_values(ascending=False)
features_to_remove = final_skewness[final_skewness.abs() > threshold].index
if len(features_to_remove) > 0:
print(f"\nFeatures to be removed due to high skewness (>{threshold}):")
for feature in features_to_remove:
print(f" - {feature}")
else:
print("\nNo features to remove. All features are within the skewness threshold.")
train_data_cleaned = train_data_transformed.drop(columns=features_to_remove)
print(f"\nNumber of features removed: {len(features_to_remove)}")
train_data_cleaned.to_csv('train_data_03.csv', index=False)
Features to be removed due to high skewness (>0.75): - concavity_mean - fractal_dimension_se - concave points_mean - concavity_se - compactness_se - concavity_worst - radius_se - symmetry_se Number of features removed: 8
Iterative process where we remove one feature at a time, re-calculate the correlation matrix, and then reassess which features to remove next. This approach can help address any changes in correlation dynamics after each feature is removed.
numerical_data = train_data_cleaned.select_dtypes(include=['float64', 'int64'])
correlation_threshold = 0.8
def find_most_correlated(data, threshold):
correlation_matrix = data.corr()
features = correlation_matrix.columns
for i in range(len(features)):
for j in range(i):
if abs(correlation_matrix.iloc[i, j]) > threshold:
return (features[i], features[j])
return None
while True:
pair = find_most_correlated(numerical_data, correlation_threshold)
if pair is None:
break
feature_to_remove = pair[0]
numerical_data = numerical_data.drop(columns=[feature_to_remove])
print(f"Removed {feature_to_remove} due to high correlation with {pair[1]}")
plt.figure(figsize=(20, 20))
sns.heatmap(numerical_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Final Correlation Matrix for Numerical Features')
plt.show()
numerical_data.to_csv('reduced_data.csv', index=False)
Removed perimeter_mean due to high correlation with radius_mean Removed area_mean due to high correlation with radius_mean Removed area_se due to high correlation with perimeter_se Removed radius_worst due to high correlation with radius_mean Removed texture_worst due to high correlation with texture_mean Removed perimeter_worst due to high correlation with radius_mean Removed area_worst due to high correlation with radius_mean Removed smoothness_worst due to high correlation with smoothness_mean Removed compactness_worst due to high correlation with compactness_mean
<Figure size 2000x2000 with 0 Axes>
<Axes: >
Text(0.5, 1.0, 'Final Correlation Matrix for Numerical Features')
Standardize or normalize your data especially for ANN.
scaler = StandardScaler()
data[data.columns] = scaler.fit_transform(data[data.columns])
def create_target_column(dataset):
dataset['target'] = (dataset['survival_probability'] > 0.5).astype(int)
dataset.drop(columns=['survival_probability'], inplace=True)
return dataset
def get_outliers_mask(dataset):
numerical_columns = dataset.select_dtypes(include=['int64', 'float64']).columns
outliers_mask = pd.Series([False] * len(dataset), index=dataset.index)
for column in numerical_columns:
Q1 = dataset[column].quantile(0.25)
Q3 = dataset[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_mask |= (dataset[column] < lower_bound) | (dataset[column] > upper_bound)
return outliers_mask
def outliers_removal(X, y):
outliers_mask = get_outliers_mask(X)
X = X[~outliers_mask]
y = y.loc[X.index]
return X, y
def impute_values(X):
X.replace('?', pd.NA, inplace=True)
X.dropna(inplace=True)
return X
def transform_features(X, skewness_threshold=0.75):
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
skewed_features = X[numerical_columns].skew().sort_values(ascending=False)
for feature in skewed_features.index:
skewness = skewed_features[feature]
if skewness > skewness_threshold:
if (X[feature] > 0).all():
X[feature] = np.log1p(X[feature])
else:
print(f"Skipping log transformation for {feature} due to zero or negative values.")
elif skewness < -skewness_threshold:
X[feature] = np.sqrt(np.abs(X[feature]))
return X
def remove_skewed_features(X, skewness_threshold=0.75):
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
skewed_features = X[numerical_columns].skew().sort_values(ascending=False)
features_to_remove = skewed_features[abs(skewed_features) > skewness_threshold].index
if len(features_to_remove) > 0:
X = X.drop(columns=features_to_remove)
return X
def one_hot_encode(X):
categorical_columns = ['diagnosis', 'stage_of_cancer', 'treatment_administered']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
return X
def feature_selection_correlation(X, correlation_threshold=0.8):
def find_most_correlated(data, threshold):
correlation_matrix = data.corr()
features = correlation_matrix.columns
max_correlation = threshold
feature_to_remove = None
for i in range(len(features)):
for j in range(i):
if abs(correlation_matrix.iloc[i, j]) > max_correlation:
max_correlation = abs(correlation_matrix.iloc[i, j])
feature_to_remove = features[i] if abs(correlation_matrix.iloc[i, j]) >= abs(correlation_matrix.iloc[j, i]) else features[j]
return feature_to_remove
while True:
feature_to_remove = find_most_correlated(X, correlation_threshold)
if feature_to_remove is None:
break
X = X.drop(columns=[feature_to_remove])
return X
def split_data(dataset):
X = dataset.drop(columns=['target'])
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
return X_train, X_test, y_train, y_test
def apply_transformations(X, y):
X, y = outliers_removal(X, y)
X = impute_values(X)
X = transform_features(X)
X = remove_skewed_features(X)
X = one_hot_encode(X)
X = feature_selection_correlation(X)
return X, y
def process_data(dataset):
dataset = create_target_column(dataset)
X_train, X_test, y_train, y_test = split_data(dataset)
X_train, y_train = apply_transformations(X_train, y_train)
X_test, y_test = apply_transformations(X_test, y_test)
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]
return X_train, X_test, y_train, y_test
td = pd.read_csv('data.csv')
X_train, X_test, y_train, y_test = process_data(td)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
results = {}
<ipython-input-19-b0ac60b7fdca>:31: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X.replace('?', pd.NA, inplace=True)
<ipython-input-19-b0ac60b7fdca>:32: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X.dropna(inplace=True)
<ipython-input-19-b0ac60b7fdca>:45: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X[feature] = np.log1p(X[feature])
Skipping log transformation for concavity_mean due to zero or negative values. Skipping log transformation for concave points_mean due to zero or negative values. Skipping log transformation for concavity_se due to zero or negative values. Skipping log transformation for concavity_worst due to zero or negative values. Skipping log transformation for concavity_mean due to zero or negative values. Skipping log transformation for concave points_mean due to zero or negative values. Skipping log transformation for concavity_se due to zero or negative values. Skipping log transformation for concavity_worst due to zero or negative values. X_train shape: (255, 21) X_test shape: (61, 21) y_train shape: (255,) y_test shape: (61,)
<ipython-input-19-b0ac60b7fdca>:31: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X.replace('?', pd.NA, inplace=True)
<ipython-input-19-b0ac60b7fdca>:32: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X.dropna(inplace=True)
<ipython-input-19-b0ac60b7fdca>:45: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X[feature] = np.log1p(X[feature])
This function will just train the logistic regression model on the entire training set without any cross-validation.
def train_logistic_regression(X_train, y_train):
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
return model
This function will perform cross-validation using 5 folds.
def train_and_evaluate_logistic_regression_with_cv(X_train, y_train, n_splits=5):
model = LogisticRegression(max_iter=1000, random_state=42)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_results = []
for train_index, val_index in kf.split(X_train):
X_train_k, X_val_k = X_train.iloc[train_index], X_train.iloc[val_index]
y_train_k, y_val_k = y_train.iloc[train_index], y_train.iloc[val_index]
model.fit(X_train_k, y_train_k)
results = evaluate_logistic_regression(model, X_val_k, y_val_k, f"Validation Fold {len(cv_results)+1}")
cv_results.append(results)
# Calculate average of the results
average_results = pd.DataFrame(cv_results).mean().to_dict()
print(f"Average K-Folds results: {average_results}")
return model, average_results
This function evaluates the model and returns the results.
def evaluate_logistic_regression(model, X, y, dataset_name):
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_prob)
cm = confusion_matrix(y, y_pred)
sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
plt.figure(figsize=(6, 3))
fpr, tpr, _ = roc_curve(y, y_prob)
plt.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - {dataset_name}')
plt.legend()
plt.show()
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False, square=True)
plt.title(f'Confusion Matrix - {dataset_name}')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
initial_lr_model = train_logistic_regression(X_train, y_train)
original_train_results = evaluate_logistic_regression(initial_lr_model, X_train, y_train, "Initial Train Set")
cv_lr_model, cv_train_results = train_and_evaluate_logistic_regression_with_cv(X_train, y_train)
test_results = evaluate_logistic_regression(cv_lr_model, X_test, y_test, "Test Set")
test_results = evaluate_logistic_regression(cv_lr_model, X_test, y_test, "Test Set")
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results
}).T
display(results_df)
Average K-Folds results: {'Accuracy': 0.9254901960784314, 'ROC AUC': 0.9819241461520323, 'Sensitivity': 0.9485377236193934, 'Specificity': 0.8938641069075853}
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
def train_decision_tree(X_train, y_train, criterion='gini', max_depth=3, min_samples_split=10, min_samples_leaf=5):
dt_model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
random_state=42)
dt_model.fit(X_train, y_train)
return dt_model
def train_decision_tree_with_cv(X_train, y_train, n_splits=5, criterion='gini', max_depth=3, min_samples_split=10, min_samples_leaf=5):
model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
random_state=42)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for train_index, val_index in kf.split(X_train):
X_train_k, _ = X_train.iloc[train_index], X_train.iloc[val_index]
y_train_k, _ = y_train.iloc[train_index], y_train.iloc[val_index]
model.fit(X_train_k, y_train_k)
return model
def visualize_tree(dt_model, feature_names):
plt.figure(figsize=(20, 10))
plot_tree(dt_model, filled=True, feature_names=feature_names, class_names=['0', '1'], rounded=True)
plt.title('Decision Tree Visualization')
plt.show()
def evaluate_decision_tree(model, X, y, dataset_name):
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_prob)
cm = confusion_matrix(y, y_pred)
sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
plt.figure(figsize=(6, 3))
fpr, tpr, _ = roc_curve(y, y_prob)
plt.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - {dataset_name}')
plt.legend()
plt.show()
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False, square=True)
plt.title(f'Confusion Matrix - {dataset_name}')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
dt_initial = train_decision_tree(X_train, y_train)
initial_train_results_dt = evaluate_decision_tree(dt_initial, X_train, y_train, "Decision Tree - Initial Train Set")
visualize_tree(dt_initial, X_train.columns)
results["Decision Tree - Initial Train Set"] = initial_train_results_dt
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree': initial_train_results_dt,
}).T
display(results_df)
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
dt_cv = train_decision_tree_with_cv(X_train, y_train)
cv_train_results_dt = evaluate_decision_tree(dt_cv, X_train, y_train, "Decision Tree - CV Train Set")
visualize_tree(dt_cv, X_train.columns)
results["Decision Tree - CV Train Set"] = cv_train_results_dt
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree': initial_train_results_dt,
'Decision Tree (CV)': cv_train_results_dt
}).T
display(results_df)
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree (CV) | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
test_results_dt = evaluate_decision_tree(dt_cv, X_test, y_test, "Decision Tree - Test Set")
results["Decision Tree - Test Set"] = test_results_dt
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree': initial_train_results_dt,
'Decision Tree (CV)': cv_train_results_dt,
'Decision Tree (Test)': test_results_dt
}).T
display(results_df)
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree (CV) | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
| Decision Tree (Test) | 0.786885 | 0.828089 | 0.820513 | 0.727273 |
Simulating CHAID
dt_model_gini = train_decision_tree(X_train, y_train, criterion='gini', max_depth=3)
evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth')
visualize_tree(dt_model_gini, X_train.columns)
dt_model_entropy = train_decision_tree(X_train, y_train, criterion='entropy', max_depth=3)
evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth')
visualize_tree(dt_model_entropy, X_train.columns)
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree': initial_train_results_dt,
'Decision Tree (CV)': cv_train_results_dt,
'Decision Tree (Test)': test_results_dt,
'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth')
}).T
display(results_df)
{'Accuracy': 0.8032786885245902,
'ROC AUC': 0.8315850815850816,
'Sensitivity': 0.9487179487179487,
'Specificity': 0.5454545454545454}
{'Accuracy': 0.7868852459016393,
'ROC AUC': 0.784965034965035,
'Sensitivity': 0.8974358974358975,
'Specificity': 0.5909090909090909}
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree (CV) | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
| Decision Tree (Test) | 0.786885 | 0.828089 | 0.820513 | 0.727273 |
| Decision Tree - Gini - Limited Depth | 0.803279 | 0.831585 | 0.948718 | 0.545455 |
| Decision Tree - Entropy - Limited Depth | 0.786885 | 0.784965 | 0.897436 | 0.590909 |
Train the Neural Network (ANN)
def train_neural_network(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000):
ann_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=42, verbose=True)
ann_model.fit(X_train, y_train)
return ann_model
def train_neural_network_with_kfolds(X, y, n_splits=5, hidden_layer_sizes=(100,), activation='relu', max_iter=1000):
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
models = []
for train_index, val_index in kf.split(X):
X_train, X_val = X.iloc[train_index], X.iloc[val_index]
y_train, y_val = y.iloc[train_index], y.iloc[val_index]
ann_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=42, verbose=True)
ann_model.fit(X_train, y_train)
models.append(ann_model)
return models
Evaluate the Neural Network
def evaluate_neural_network(ann_model, X_train, y_train, model_name):
y_pred = ann_model.predict(X_train)
y_prob = ann_model.predict_proba(X_train)[:, 1]
roc_auc = roc_auc_score(y_train, y_prob)
fpr, tpr, _ = roc_curve(y_train, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'Neural Network (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Neural Network')
plt.legend()
plt.show()
cm = confusion_matrix(y_train, y_pred)
sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
accuracy = accuracy_score(y_train, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens")
plt.title('Confusion Matrix - Neural Network')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()
return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
ann_model = train_neural_network(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
initial_train_results_ann = evaluate_neural_network(ann_model, X_train, y_train, 'Artificial Neural Network')
results["ANN - Initial Train Set"] = initial_train_results_ann
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree - Initial Train Set': initial_train_results_dt,
'Decision Tree - CV Train Set': cv_train_results_dt,
'Decision Tree - Test Set': test_results_dt,
'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
'ANN': initial_train_results_ann
}).T
display(results_df)
Iteration 1, loss = 4.02230292 Iteration 2, loss = 2.10454022 Iteration 3, loss = 0.86947080 Iteration 4, loss = 0.63834363 Iteration 5, loss = 0.92990311 Iteration 6, loss = 1.12074851 Iteration 7, loss = 1.02213408 Iteration 8, loss = 0.80638496 Iteration 9, loss = 0.64397050 Iteration 10, loss = 0.61286203 Iteration 11, loss = 0.65718838 Iteration 12, loss = 0.70880343 Iteration 13, loss = 0.71580951 Iteration 14, loss = 0.67234684 Iteration 15, loss = 0.62725891 Iteration 16, loss = 0.58560556 Iteration 17, loss = 0.57560980 Iteration 18, loss = 0.57066244 Iteration 19, loss = 0.56298485 Iteration 20, loss = 0.54736251 Iteration 21, loss = 0.52878874 Iteration 22, loss = 0.51326551 Iteration 23, loss = 0.49312804 Iteration 24, loss = 0.48197188 Iteration 25, loss = 0.47281459 Iteration 26, loss = 0.46184911 Iteration 27, loss = 0.44970322 Iteration 28, loss = 0.43596365 Iteration 29, loss = 0.41821720 Iteration 30, loss = 0.40762723 Iteration 31, loss = 0.39689920 Iteration 32, loss = 0.38930702 Iteration 33, loss = 0.38296267 Iteration 34, loss = 0.37670067 Iteration 35, loss = 0.36881320 Iteration 36, loss = 0.36026601 Iteration 37, loss = 0.36009198 Iteration 38, loss = 0.35783499 Iteration 39, loss = 0.34957359 Iteration 40, loss = 0.34125632 Iteration 41, loss = 0.33902456 Iteration 42, loss = 0.33582467 Iteration 43, loss = 0.32950126 Iteration 44, loss = 0.32532578 Iteration 45, loss = 0.32451970 Iteration 46, loss = 0.32281269 Iteration 47, loss = 0.32066127 Iteration 48, loss = 0.31576226 Iteration 49, loss = 0.31280710 Iteration 50, loss = 0.31048232 Iteration 51, loss = 0.30959967 Iteration 52, loss = 0.30675096 Iteration 53, loss = 0.30470100 Iteration 54, loss = 0.30382580 Iteration 55, loss = 0.30156411 Iteration 56, loss = 0.30085709 Iteration 57, loss = 0.29825943 Iteration 58, loss = 0.29573218 Iteration 59, loss = 0.29429188 Iteration 60, loss = 0.29269111 Iteration 61, loss = 0.29140750 Iteration 62, loss = 0.29282380 Iteration 63, loss = 0.29039945 Iteration 64, loss = 0.28653339 Iteration 65, loss = 0.28746691 Iteration 66, loss = 0.28743537 Iteration 67, loss = 0.28227269 Iteration 68, loss = 0.28445777 Iteration 69, loss = 0.28291903 Iteration 70, loss = 0.27880415 Iteration 71, loss = 0.27871387 Iteration 72, loss = 0.27958179 Iteration 73, loss = 0.27518067 Iteration 74, loss = 0.27381807 Iteration 75, loss = 0.27147686 Iteration 76, loss = 0.27001834 Iteration 77, loss = 0.26881486 Iteration 78, loss = 0.26853387 Iteration 79, loss = 0.26798311 Iteration 80, loss = 0.26576272 Iteration 81, loss = 0.26483172 Iteration 82, loss = 0.26545454 Iteration 83, loss = 0.26324829 Iteration 84, loss = 0.26074457 Iteration 85, loss = 0.26010622 Iteration 86, loss = 0.25794394 Iteration 87, loss = 0.25850639 Iteration 88, loss = 0.25669207 Iteration 89, loss = 0.25484294 Iteration 90, loss = 0.25658803 Iteration 91, loss = 0.25209274 Iteration 92, loss = 0.25700625 Iteration 93, loss = 0.25482831 Iteration 94, loss = 0.24928817 Iteration 95, loss = 0.25103163 Iteration 96, loss = 0.24688681 Iteration 97, loss = 0.24745477 Iteration 98, loss = 0.25403423 Iteration 99, loss = 0.24920378 Iteration 100, loss = 0.24413673 Iteration 101, loss = 0.24500362 Iteration 102, loss = 0.24273261 Iteration 103, loss = 0.24267903 Iteration 104, loss = 0.24122914 Iteration 105, loss = 0.23857052 Iteration 106, loss = 0.23781333 Iteration 107, loss = 0.23669402 Iteration 108, loss = 0.23579680 Iteration 109, loss = 0.23448541 Iteration 110, loss = 0.23481622 Iteration 111, loss = 0.23363717 Iteration 112, loss = 0.23239794 Iteration 113, loss = 0.23182923 Iteration 114, loss = 0.23044262 Iteration 115, loss = 0.22943965 Iteration 116, loss = 0.23016530 Iteration 117, loss = 0.22802171 Iteration 118, loss = 0.22735506 Iteration 119, loss = 0.23126033 Iteration 120, loss = 0.22780703 Iteration 121, loss = 0.22509048 Iteration 122, loss = 0.22728015 Iteration 123, loss = 0.22363601 Iteration 124, loss = 0.22647265 Iteration 125, loss = 0.22786131 Iteration 126, loss = 0.22139982 Iteration 127, loss = 0.22084346 Iteration 128, loss = 0.21927932 Iteration 129, loss = 0.21913891 Iteration 130, loss = 0.21673575 Iteration 131, loss = 0.21834780 Iteration 132, loss = 0.22036976 Iteration 133, loss = 0.21580011 Iteration 134, loss = 0.21415451 Iteration 135, loss = 0.21484682 Iteration 136, loss = 0.21173361 Iteration 137, loss = 0.21456477 Iteration 138, loss = 0.21810526 Iteration 139, loss = 0.21252006 Iteration 140, loss = 0.21039078 Iteration 141, loss = 0.21035409 Iteration 142, loss = 0.20973249 Iteration 143, loss = 0.20748699 Iteration 144, loss = 0.20696330 Iteration 145, loss = 0.20592567 Iteration 146, loss = 0.20578461 Iteration 147, loss = 0.20528378 Iteration 148, loss = 0.20517077 Iteration 149, loss = 0.20504235 Iteration 150, loss = 0.20473887 Iteration 151, loss = 0.20418614 Iteration 152, loss = 0.20357433 Iteration 153, loss = 0.20130895 Iteration 154, loss = 0.20459066 Iteration 155, loss = 0.20086647 Iteration 156, loss = 0.20362074 Iteration 157, loss = 0.20427172 Iteration 158, loss = 0.19801184 Iteration 159, loss = 0.19825854 Iteration 160, loss = 0.19987893 Iteration 161, loss = 0.19808404 Iteration 162, loss = 0.19706006 Iteration 163, loss = 0.19545960 Iteration 164, loss = 0.19198142 Iteration 165, loss = 0.19874646 Iteration 166, loss = 0.19975094 Iteration 167, loss = 0.18969855 Iteration 168, loss = 0.19592014 Iteration 169, loss = 0.20318988 Iteration 170, loss = 0.18861323 Iteration 171, loss = 0.19687122 Iteration 172, loss = 0.19971058 Iteration 173, loss = 0.18872662 Iteration 174, loss = 0.19407785 Iteration 175, loss = 0.18840658 Iteration 176, loss = 0.18642795 Iteration 177, loss = 0.20257853 Iteration 178, loss = 0.19036916 Iteration 179, loss = 0.18954640 Iteration 180, loss = 0.19262127 Iteration 181, loss = 0.18555495 Iteration 182, loss = 0.18221644 Iteration 183, loss = 0.18179603 Iteration 184, loss = 0.18384272 Iteration 185, loss = 0.18038333 Iteration 186, loss = 0.18837300 Iteration 187, loss = 0.18552340 Iteration 188, loss = 0.17813300 Iteration 189, loss = 0.17841459 Iteration 190, loss = 0.17746224 Iteration 191, loss = 0.17615905 Iteration 192, loss = 0.17706448 Iteration 193, loss = 0.17445022 Iteration 194, loss = 0.17870040 Iteration 195, loss = 0.17945170 Iteration 196, loss = 0.17474412 Iteration 197, loss = 0.17325657 Iteration 198, loss = 0.17223660 Iteration 199, loss = 0.17192794 Iteration 200, loss = 0.17387431 Iteration 201, loss = 0.17361957 Iteration 202, loss = 0.17248654 Iteration 203, loss = 0.17401968 Iteration 204, loss = 0.17101373 Iteration 205, loss = 0.17081506 Iteration 206, loss = 0.17097379 Iteration 207, loss = 0.16758544 Iteration 208, loss = 0.16972054 Iteration 209, loss = 0.17010854 Iteration 210, loss = 0.16597442 Iteration 211, loss = 0.16795522 Iteration 212, loss = 0.16823160 Iteration 213, loss = 0.16418037 Iteration 214, loss = 0.16697008 Iteration 215, loss = 0.16582084 Iteration 216, loss = 0.16382480 Iteration 217, loss = 0.16484471 Iteration 218, loss = 0.16243197 Iteration 219, loss = 0.16218383 Iteration 220, loss = 0.16236868 Iteration 221, loss = 0.16106752 Iteration 222, loss = 0.16034033 Iteration 223, loss = 0.16019903 Iteration 224, loss = 0.15939317 Iteration 225, loss = 0.15812903 Iteration 226, loss = 0.15967220 Iteration 227, loss = 0.15972957 Iteration 228, loss = 0.15869792 Iteration 229, loss = 0.15701914 Iteration 230, loss = 0.15660028 Iteration 231, loss = 0.15681639 Iteration 232, loss = 0.15677736 Iteration 233, loss = 0.15779404 Iteration 234, loss = 0.16085464 Iteration 235, loss = 0.17011883 Iteration 236, loss = 0.15488930 Iteration 237, loss = 0.16842559 Iteration 238, loss = 0.15835579 Iteration 239, loss = 0.15875102 Iteration 240, loss = 0.16379257 Iteration 241, loss = 0.15261567 Iteration 242, loss = 0.15714449 Iteration 243, loss = 0.15278826 Iteration 244, loss = 0.15089522 Iteration 245, loss = 0.15671092 Iteration 246, loss = 0.15103484 Iteration 247, loss = 0.15231768 Iteration 248, loss = 0.15242954 Iteration 249, loss = 0.14729689 Iteration 250, loss = 0.14944591 Iteration 251, loss = 0.15086011 Iteration 252, loss = 0.14731480 Iteration 253, loss = 0.14643971 Iteration 254, loss = 0.14697819 Iteration 255, loss = 0.14535753 Iteration 256, loss = 0.14441778 Iteration 257, loss = 0.14441094 Iteration 258, loss = 0.14371784 Iteration 259, loss = 0.14410804 Iteration 260, loss = 0.14373181 Iteration 261, loss = 0.14281049 Iteration 262, loss = 0.14266847 Iteration 263, loss = 0.14150942 Iteration 264, loss = 0.14135549 Iteration 265, loss = 0.14306235 Iteration 266, loss = 0.13924119 Iteration 267, loss = 0.14683114 Iteration 268, loss = 0.14989169 Iteration 269, loss = 0.14164417 Iteration 270, loss = 0.14145437 Iteration 271, loss = 0.13924703 Iteration 272, loss = 0.13965202 Iteration 273, loss = 0.14137038 Iteration 274, loss = 0.14434584 Iteration 275, loss = 0.13998019 Iteration 276, loss = 0.14006837 Iteration 277, loss = 0.13590659 Iteration 278, loss = 0.14019764 Iteration 279, loss = 0.14451119 Iteration 280, loss = 0.13560581 Iteration 281, loss = 0.13674836 Iteration 282, loss = 0.13751512 Iteration 283, loss = 0.13359945 Iteration 284, loss = 0.13567546 Iteration 285, loss = 0.13645231 Iteration 286, loss = 0.13285221 Iteration 287, loss = 0.13333664 Iteration 288, loss = 0.13635857 Iteration 289, loss = 0.13312503 Iteration 290, loss = 0.13223208 Iteration 291, loss = 0.13965799 Iteration 292, loss = 0.13122649 Iteration 293, loss = 0.13519012 Iteration 294, loss = 0.13540466 Iteration 295, loss = 0.13035279 Iteration 296, loss = 0.13180443 Iteration 297, loss = 0.12740952 Iteration 298, loss = 0.13376283 Iteration 299, loss = 0.13312770 Iteration 300, loss = 0.12937742 Iteration 301, loss = 0.13417229 Iteration 302, loss = 0.12919634 Iteration 303, loss = 0.12877167 Iteration 304, loss = 0.12688507 Iteration 305, loss = 0.12518773 Iteration 306, loss = 0.12507342 Iteration 307, loss = 0.12510165 Iteration 308, loss = 0.12449899 Iteration 309, loss = 0.12375685 Iteration 310, loss = 0.12734862 Iteration 311, loss = 0.12557102 Iteration 312, loss = 0.12164985 Iteration 313, loss = 0.13707716 Iteration 314, loss = 0.12867239 Iteration 315, loss = 0.12589512 Iteration 316, loss = 0.13253958 Iteration 317, loss = 0.12346435 Iteration 318, loss = 0.12208080 Iteration 319, loss = 0.12245356 Iteration 320, loss = 0.11961748 Iteration 321, loss = 0.12394614 Iteration 322, loss = 0.12021753 Iteration 323, loss = 0.12164048 Iteration 324, loss = 0.12626440 Iteration 325, loss = 0.12032976 Iteration 326, loss = 0.12178975 Iteration 327, loss = 0.12213310 Iteration 328, loss = 0.11803913 Iteration 329, loss = 0.12114994 Iteration 330, loss = 0.11725703 Iteration 331, loss = 0.12076094 Iteration 332, loss = 0.12026022 Iteration 333, loss = 0.11602327 Iteration 334, loss = 0.11995465 Iteration 335, loss = 0.11718524 Iteration 336, loss = 0.11609747 Iteration 337, loss = 0.11760468 Iteration 338, loss = 0.11490976 Iteration 339, loss = 0.11496963 Iteration 340, loss = 0.11422677 Iteration 341, loss = 0.11375234 Iteration 342, loss = 0.11340566 Iteration 343, loss = 0.11351552 Iteration 344, loss = 0.11356643 Iteration 345, loss = 0.11311543 Iteration 346, loss = 0.11259733 Iteration 347, loss = 0.11124089 Iteration 348, loss = 0.11185655 Iteration 349, loss = 0.11200215 Iteration 350, loss = 0.11118142 Iteration 351, loss = 0.11030712 Iteration 352, loss = 0.11021561 Iteration 353, loss = 0.10982264 Iteration 354, loss = 0.10987060 Iteration 355, loss = 0.10924542 Iteration 356, loss = 0.10947071 Iteration 357, loss = 0.10895485 Iteration 358, loss = 0.10889616 Iteration 359, loss = 0.10804792 Iteration 360, loss = 0.10791342 Iteration 361, loss = 0.10907126 Iteration 362, loss = 0.10781227 Iteration 363, loss = 0.10740163 Iteration 364, loss = 0.10885671 Iteration 365, loss = 0.10664161 Iteration 366, loss = 0.11066026 Iteration 367, loss = 0.10595491 Iteration 368, loss = 0.10856119 Iteration 369, loss = 0.11647613 Iteration 370, loss = 0.10547562 Iteration 371, loss = 0.11319035 Iteration 372, loss = 0.11453445 Iteration 373, loss = 0.10660037 Iteration 374, loss = 0.10672690 Iteration 375, loss = 0.10351444 Iteration 376, loss = 0.10486834 Iteration 377, loss = 0.10980695 Iteration 378, loss = 0.10328256 Iteration 379, loss = 0.10588789 Iteration 380, loss = 0.10731047 Iteration 381, loss = 0.10215224 Iteration 382, loss = 0.10181712 Iteration 383, loss = 0.10125605 Iteration 384, loss = 0.10228608 Iteration 385, loss = 0.10076854 Iteration 386, loss = 0.10212498 Iteration 387, loss = 0.10126292 Iteration 388, loss = 0.10246588 Iteration 389, loss = 0.10508157 Iteration 390, loss = 0.10176577 Iteration 391, loss = 0.10079252 Iteration 392, loss = 0.09880238 Iteration 393, loss = 0.10208954 Iteration 394, loss = 0.09894385 Iteration 395, loss = 0.10188564 Iteration 396, loss = 0.10125334 Iteration 397, loss = 0.09814275 Iteration 398, loss = 0.09847286 Iteration 399, loss = 0.09768785 Iteration 400, loss = 0.09746975 Iteration 401, loss = 0.09705566 Iteration 402, loss = 0.09770101 Iteration 403, loss = 0.09765786 Iteration 404, loss = 0.09823282 Iteration 405, loss = 0.09750841 Iteration 406, loss = 0.09622743 Iteration 407, loss = 0.09536445 Iteration 408, loss = 0.09492604 Iteration 409, loss = 0.09453060 Iteration 410, loss = 0.09465540 Iteration 411, loss = 0.09405552 Iteration 412, loss = 0.09404265 Iteration 413, loss = 0.09435809 Iteration 414, loss = 0.09354805 Iteration 415, loss = 0.09284339 Iteration 416, loss = 0.09329854 Iteration 417, loss = 0.09286583 Iteration 418, loss = 0.09342240 Iteration 419, loss = 0.09173622 Iteration 420, loss = 0.09311403 Iteration 421, loss = 0.09077651 Iteration 422, loss = 0.09321873 Iteration 423, loss = 0.09357174 Iteration 424, loss = 0.09166461 Iteration 425, loss = 0.09352065 Iteration 426, loss = 0.08989276 Iteration 427, loss = 0.09230079 Iteration 428, loss = 0.09229090 Iteration 429, loss = 0.08877926 Iteration 430, loss = 0.09419020 Iteration 431, loss = 0.09059947 Iteration 432, loss = 0.09079556 Iteration 433, loss = 0.09054734 Iteration 434, loss = 0.08770177 Iteration 435, loss = 0.09359509 Iteration 436, loss = 0.08819395 Iteration 437, loss = 0.09153621 Iteration 438, loss = 0.09317750 Iteration 439, loss = 0.08635020 Iteration 440, loss = 0.09001044 Iteration 441, loss = 0.09009227 Iteration 442, loss = 0.08569725 Iteration 443, loss = 0.09009921 Iteration 444, loss = 0.08791455 Iteration 445, loss = 0.08970940 Iteration 446, loss = 0.09254000 Iteration 447, loss = 0.08717615 Iteration 448, loss = 0.08879322 Iteration 449, loss = 0.08509210 Iteration 450, loss = 0.08938803 Iteration 451, loss = 0.08714110 Iteration 452, loss = 0.08362052 Iteration 453, loss = 0.08622018 Iteration 454, loss = 0.08347077 Iteration 455, loss = 0.08590590 Iteration 456, loss = 0.08621327 Iteration 457, loss = 0.08234254 Iteration 458, loss = 0.08418872 Iteration 459, loss = 0.08382589 Iteration 460, loss = 0.08233419 Iteration 461, loss = 0.08205337 Iteration 462, loss = 0.08343911 Iteration 463, loss = 0.08199110 Iteration 464, loss = 0.08120543 Iteration 465, loss = 0.08116426 Iteration 466, loss = 0.08080704 Iteration 467, loss = 0.08046211 Iteration 468, loss = 0.08026456 Iteration 469, loss = 0.08064470 Iteration 470, loss = 0.07995165 Iteration 471, loss = 0.08244801 Iteration 472, loss = 0.08266426 Iteration 473, loss = 0.08347053 Iteration 474, loss = 0.08072649 Iteration 475, loss = 0.07981491 Iteration 476, loss = 0.08162995 Iteration 477, loss = 0.07846417 Iteration 478, loss = 0.08384241 Iteration 479, loss = 0.08154267 Iteration 480, loss = 0.07840825 Iteration 481, loss = 0.08353738 Iteration 482, loss = 0.08160882 Iteration 483, loss = 0.07852621 Iteration 484, loss = 0.07844408 Iteration 485, loss = 0.07728118 Iteration 486, loss = 0.07703614 Iteration 487, loss = 0.07705122 Iteration 488, loss = 0.07667738 Iteration 489, loss = 0.07577607 Iteration 490, loss = 0.07723084 Iteration 491, loss = 0.07596730 Iteration 492, loss = 0.07628094 Iteration 493, loss = 0.07777272 Iteration 494, loss = 0.07465312 Iteration 495, loss = 0.07940364 Iteration 496, loss = 0.08560379 Iteration 497, loss = 0.07329538 Iteration 498, loss = 0.08452327 Iteration 499, loss = 0.08268718 Iteration 500, loss = 0.07561424 Iteration 501, loss = 0.07888863 Iteration 502, loss = 0.07339273 Iteration 503, loss = 0.07714437 Iteration 504, loss = 0.07403349 Iteration 505, loss = 0.07521156 Iteration 506, loss = 0.08397237 Iteration 507, loss = 0.07553793 Iteration 508, loss = 0.07679296 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree - Initial Train Set | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree - CV Train Set | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
| Decision Tree - Test Set | 0.786885 | 0.828089 | 0.820513 | 0.727273 |
| Decision Tree - Gini - Limited Depth | 0.803279 | 0.831585 | 0.948718 | 0.545455 |
| Decision Tree - Entropy - Limited Depth | 0.786885 | 0.784965 | 0.897436 | 0.590909 |
| ANN | 0.984314 | 0.999739 | 1.000000 | 0.958763 |
models = train_neural_network_with_kfolds(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
cv_train_results_ann = evaluate_neural_network(models[-1], X_train, y_train, 'Artificial Neural Network')
results["ANN - CV Train Set"] = cv_train_results_ann
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree - Initial Train Set': initial_train_results_dt,
'Decision Tree - CV Train Set': cv_train_results_dt,
'Decision Tree - Test Set': test_results_dt,
'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
'ANN': initial_train_results_ann,
'Ann (CV)': cv_train_results_ann
}).T
display(results_df)
Iteration 1, loss = 4.22163960 Iteration 2, loss = 2.23266392 Iteration 3, loss = 0.98520147 Iteration 4, loss = 0.54672483 Iteration 5, loss = 0.66943191 Iteration 6, loss = 0.93194218 Iteration 7, loss = 1.02349321 Iteration 8, loss = 0.96440416 Iteration 9, loss = 0.79275934 Iteration 10, loss = 0.62363374 Iteration 11, loss = 0.56077809 Iteration 12, loss = 0.57959426 Iteration 13, loss = 0.61640017 Iteration 14, loss = 0.64929655 Iteration 15, loss = 0.65909269 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping. Iteration 1, loss = 4.22354979 Iteration 2, loss = 2.25010972 Iteration 3, loss = 0.97161520 Iteration 4, loss = 0.58184682 Iteration 5, loss = 0.69279621 Iteration 6, loss = 0.80113225 Iteration 7, loss = 0.81207305 Iteration 8, loss = 0.73692669 Iteration 9, loss = 0.62985174 Iteration 10, loss = 0.58411131 Iteration 11, loss = 0.59810975 Iteration 12, loss = 0.62565651 Iteration 13, loss = 0.63808097 Iteration 14, loss = 0.62662879 Iteration 15, loss = 0.59222289 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping. Iteration 1, loss = 4.40308327 Iteration 2, loss = 2.35128481 Iteration 3, loss = 0.96430909 Iteration 4, loss = 0.58045854 Iteration 5, loss = 0.67514544 Iteration 6, loss = 0.82212449 Iteration 7, loss = 0.88910858 Iteration 8, loss = 0.81014449 Iteration 9, loss = 0.66751640 Iteration 10, loss = 0.57471490 Iteration 11, loss = 0.57928092 Iteration 12, loss = 0.63068796 Iteration 13, loss = 0.65174381 Iteration 14, loss = 0.62220160 Iteration 15, loss = 0.56242003 Iteration 16, loss = 0.52825348 Iteration 17, loss = 0.55181935 Iteration 18, loss = 0.62242423 Iteration 19, loss = 0.67718384 Iteration 20, loss = 0.68012165 Iteration 21, loss = 0.61506530 Iteration 22, loss = 0.53279130 Iteration 23, loss = 0.49080675 Iteration 24, loss = 0.48420794 Iteration 25, loss = 0.49438687 Iteration 26, loss = 0.51639462 Iteration 27, loss = 0.51440801 Iteration 28, loss = 0.48609095 Iteration 29, loss = 0.47502913 Iteration 30, loss = 0.47100567 Iteration 31, loss = 0.46709770 Iteration 32, loss = 0.47838106 Iteration 33, loss = 0.50439703 Iteration 34, loss = 0.46787340 Iteration 35, loss = 0.40034390 Iteration 36, loss = 0.39182700 Iteration 37, loss = 0.40217608 Iteration 38, loss = 0.38545705 Iteration 39, loss = 0.36846385 Iteration 40, loss = 0.36162610 Iteration 41, loss = 0.36822233 Iteration 42, loss = 0.38895401 Iteration 43, loss = 0.36801980 Iteration 44, loss = 0.35042417 Iteration 45, loss = 0.34514174 Iteration 46, loss = 0.33672332 Iteration 47, loss = 0.33139784 Iteration 48, loss = 0.32938864 Iteration 49, loss = 0.33058718 Iteration 50, loss = 0.34757384 Iteration 51, loss = 0.35010392 Iteration 52, loss = 0.32402906 Iteration 53, loss = 0.35111728 Iteration 54, loss = 0.39375771 Iteration 55, loss = 0.36079342 Iteration 56, loss = 0.31791175 Iteration 57, loss = 0.32537116 Iteration 58, loss = 0.32398032 Iteration 59, loss = 0.31360946 Iteration 60, loss = 0.31106698 Iteration 61, loss = 0.31525581 Iteration 62, loss = 0.33791280 Iteration 63, loss = 0.34376663 Iteration 64, loss = 0.31549679 Iteration 65, loss = 0.30202004 Iteration 66, loss = 0.30222907 Iteration 67, loss = 0.32165497 Iteration 68, loss = 0.35633922 Iteration 69, loss = 0.35763595 Iteration 70, loss = 0.30702865 Iteration 71, loss = 0.30646405 Iteration 72, loss = 0.41109411 Iteration 73, loss = 0.38481325 Iteration 74, loss = 0.29196668 Iteration 75, loss = 0.35370177 Iteration 76, loss = 0.49051039 Iteration 77, loss = 0.51066222 Iteration 78, loss = 0.37842278 Iteration 79, loss = 0.29594399 Iteration 80, loss = 0.33059990 Iteration 81, loss = 0.35648972 Iteration 82, loss = 0.31689687 Iteration 83, loss = 0.30439458 Iteration 84, loss = 0.34936877 Iteration 85, loss = 0.39807018 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping. Iteration 1, loss = 4.21620790 Iteration 2, loss = 2.24222414 Iteration 3, loss = 0.93538134 Iteration 4, loss = 0.61455033 Iteration 5, loss = 0.86172328 Iteration 6, loss = 1.10095280 Iteration 7, loss = 1.13128692 Iteration 8, loss = 1.05086702 Iteration 9, loss = 0.91318579 Iteration 10, loss = 0.76235227 Iteration 11, loss = 0.66191068 Iteration 12, loss = 0.63935631 Iteration 13, loss = 0.69674533 Iteration 14, loss = 0.81829953 Iteration 15, loss = 0.93396626 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping. Iteration 1, loss = 3.92051852 Iteration 2, loss = 2.13573007 Iteration 3, loss = 0.99513238 Iteration 4, loss = 0.68912846 Iteration 5, loss = 0.90766105 Iteration 6, loss = 1.15997858 Iteration 7, loss = 1.12201840 Iteration 8, loss = 0.91680068 Iteration 9, loss = 0.76407906 Iteration 10, loss = 0.71199174 Iteration 11, loss = 0.73502274 Iteration 12, loss = 0.77686128 Iteration 13, loss = 0.80032925 Iteration 14, loss = 0.79393632 Iteration 15, loss = 0.77233785 Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree - Initial Train Set | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree - CV Train Set | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
| Decision Tree - Test Set | 0.786885 | 0.828089 | 0.820513 | 0.727273 |
| Decision Tree - Gini - Limited Depth | 0.803279 | 0.831585 | 0.948718 | 0.545455 |
| Decision Tree - Entropy - Limited Depth | 0.786885 | 0.784965 | 0.897436 | 0.590909 |
| ANN | 0.984314 | 0.999739 | 1.000000 | 0.958763 |
| Ann (CV) | 0.780392 | 0.865718 | 0.784810 | 0.773196 |
test_results_ann = evaluate_neural_network(models[-1], X_test, y_test, 'ANN - Test Set')
results["ANN - Test Set"] = test_results_ann
results_df = pd.DataFrame({
'Logistic Regression': original_train_results,
'Logistic Regression (CV)': cv_train_results,
'Logistic Regression (Test)': test_results,
'Decision Tree': initial_train_results_dt,
'Decision Tree (CV)': cv_train_results_dt,
'Decision Tree (Test)': test_results_dt,
'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
'ANN': initial_train_results_ann,
'ANN (CV)': cv_train_results_ann,
'ANN (Test)': test_results_ann
}).T
display(results_df)
| Accuracy | ROC AUC | Sensitivity | Specificity | |
|---|---|---|---|---|
| Logistic Regression | 0.952941 | 0.992431 | 0.962025 | 0.938144 |
| Logistic Regression (CV) | 0.925490 | 0.981924 | 0.948538 | 0.893864 |
| Logistic Regression (Test) | 0.918033 | 0.970862 | 0.923077 | 0.909091 |
| Decision Tree | 0.862745 | 0.943886 | 0.936709 | 0.742268 |
| Decision Tree (CV) | 0.858824 | 0.924670 | 0.822785 | 0.917526 |
| Decision Tree (Test) | 0.786885 | 0.828089 | 0.820513 | 0.727273 |
| Decision Tree - Gini - Limited Depth | 0.803279 | 0.831585 | 0.948718 | 0.545455 |
| Decision Tree - Entropy - Limited Depth | 0.786885 | 0.784965 | 0.897436 | 0.590909 |
| ANN | 0.984314 | 0.999739 | 1.000000 | 0.958763 |
| ANN (CV) | 0.780392 | 0.865718 | 0.784810 | 0.773196 |
| ANN (Test) | 0.737705 | 0.770396 | 0.717949 | 0.772727 |
y_pred_lr = initial_lr_model.predict(X_test)
y_pred_lr_cv = cv_lr_model.predict(X_test)
y_pred_dt = dt_initial.predict(X_test)
y_pred_dt_cv = dt_cv.predict(X_test)
y_pred_dt_gini = dt_model_gini.predict(X_test)
y_pred_dt_entropy = dt_model_entropy.predict(X_test)
y_pred_ann = ann_model.predict(X_test)
y_pred_ann_cv = models[-1].predict(X_test)
precision_scores = {
"Logistic Regression": precision_score(y_test, y_pred_lr),
"Logistic Regression - CV": precision_score(y_test, y_pred_lr_cv),
"Decision Tree": precision_score(y_test, y_pred_dt),
"Decision Tree - CV": precision_score(y_test, y_pred_dt_cv),
"Decision Tree - Gini - Limited Depth": precision_score(y_test, y_pred_dt_gini),
"Decision Tree - Entropy - Limited Depth": precision_score(y_test, y_pred_dt_entropy),
"Artificial Neural Network": precision_score(y_test, y_pred_ann),
"Artificial Neural Network - CV": precision_score(y_test, y_pred_ann_cv)
}
model_names = list(precision_scores.keys())
precision_values = list(precision_scores.values())
norm = plt.Normalize(min(precision_values), max(precision_values))
colors = [plt.cm.Greens(norm(value)) for value in precision_values]
plt.figure(figsize=(10, 8))
bar = plt.bar(model_names, precision_values, color=colors)
plt.xlabel('Model')
plt.ylabel('Precision Score')
plt.title('Comparison of Model Precision Scores')
plt.xticks(rotation=45, fontsize=6)
sm = cm.ScalarMappable(cmap=plt.cm.Greens, norm=norm)
sm.set_array([])
plt.colorbar(sm, label='Precision Score')
plt.tight_layout()
plt.show()
<Figure size 1000x800 with 0 Axes>
Text(0.5, 0, 'Model')
Text(0, 0.5, 'Precision Score')
Text(0.5, 1.0, 'Comparison of Model Precision Scores')
([0, 1, 2, 3, 4, 5, 6, 7], [Text(0, 0, 'Logistic Regression'), Text(1, 0, 'Logistic Regression - CV'), Text(2, 0, 'Decision Tree'), Text(3, 0, 'Decision Tree - CV'), Text(4, 0, 'Decision Tree - Gini - Limited Depth'), Text(5, 0, 'Decision Tree - Entropy - Limited Depth'), Text(6, 0, 'Artificial Neural Network'), Text(7, 0, 'Artificial Neural Network - CV')])
<ipython-input-38-c9fb05c09afa>:35: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes. plt.colorbar(sm, label='Precision Score')
<matplotlib.colorbar.Colorbar at 0x7ddd520bc190>
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
! pwd
/content
%%shell
jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb
[NbConvertApp] WARNING | pattern '//content/Breast_Cancer_Survival_Prediction_01.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
Whether the HTML in Markdown cells and cell outputs should be sanitized..
Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
Whether the HTML in Markdown cells and cell outputs should be sanitized.This
should be set to True by nbviewer or similar tools.
Default: False
Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
overwrite base name use for output files.
can only be used when converting one notebook at a time.
Default: ''
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.
--------------------------------------------------------------------------- CalledProcessError Traceback (most recent call last) <ipython-input-41-4ad7f0406dcb> in <cell line: 1>() ----> 1 get_ipython().run_cell_magic('shell', '', 'jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb\n') /usr/local/lib/python3.10/dist-packages/google/colab/_shell.py in run_cell_magic(self, magic_name, line, cell) 332 if line and not cell: 333 cell = ' ' --> 334 return super().run_cell_magic(magic_name, line, cell) 335 336 /usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2471 with self.builtin_trap: 2472 args = (magic_arg_s, cell) -> 2473 result = fn(*args, **kwargs) 2474 return result 2475 /usr/local/lib/python3.10/dist-packages/google/colab/_system_commands.py in _shell_cell_magic(args, cmd) 110 result = _run_command(cmd, clear_streamed_output=False) 111 if not parsed_args.ignore_errors: --> 112 result.check_returncode() 113 return result 114 /usr/local/lib/python3.10/dist-packages/google/colab/_system_commands.py in check_returncode(self) 135 def check_returncode(self): 136 if self.returncode: --> 137 raise subprocess.CalledProcessError( 138 returncode=self.returncode, cmd=self.args, output=self.output 139 ) CalledProcessError: Command 'jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb ' returned non-zero exit status 255.